Yael Ohayon - 312542558
Use one or more of the clustering algorithms we discussed in class to cluster together artists based on similarities. Usually, we use unsupervised learning in the earlier stages of the project. Discuss the results and support your claim in at least one plot (in addition to the clustering plot) This graph may relate to your predictions or incorporate any information from an outside source (please mention explicitly any source you used as your help).
In the following paragraphs I will explain what I did in order to cluster the data.
Stage I: Pre-Processing:
As explained in the requirements, I did some pre-processing, which is necessary to analyze the data later. To begin, I iterated over all images given in the train and validation folders and, using the Python image-processing package OpenCV, resized all images to 100 * 100, so from now on all analysis is made over pictures of the same dimension. The function I used can be found at the end of this notebook.
Later In order to answer the question above I choose to cluster the data according to the given label of artists origin
How I did it?
First, because the requirements asked to plot graphs, I knew I had to use 2 or 3 dimensions - plotting more than that is not possible. So I needed to reduce the image dimension, which, although minimized to 100 * 100 * 3 = 30,000 (because each pixel holds 3 color values), is still a lot!
I read about image processing and dimensionality reduction and decided to use PCA (Principal Component Analysis) in order to reduce the image dimension.
Why PCA?
PCA creates a new axis - a linear combination of the other features - that explains most of the data variance. Mathematically, we search for the combination such that projecting onto it, as a new axis, gives the highest variance of the original data compared to all other projections.
Pay attention - Due to the high cost (in time and memory) of processing the data I created all tables as numpy or pandas object (pickles) that are already in this folder and loaded for anlyzing (but not created here) - in order to check the code you can run the comment lines - the data is genereted by this code.
import sys
sys.path.append("../")
import cv2
import imutils
from PIL import Image
import cv2
import os
os.getcwd()
import numpy as np
from plotly.subplots import make_subplots
import pandas as pd
import matplotlib
from sklearn.decomposition import PCA
import plotly.graph_objects as go
#rootdir = r'E:\Surface\ML and Apllication in ECO\Final\Archive\training\training'
#val_root = r'E:\Surface\ML and Apllication in ECO\Final\Archive\validation\validation'
"""This functio create two python dict that match artist to country and country to artist in
order for data processing later"""
def dict_createion():
photo_dict = {}
artists = [dI for dI in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir, dI))]
artist_dict = dict((ar, []) for ar in artists)
for subdir, dirs, files in os.walk(rootdir):
for file in files:
artist = (os.path.basename(os.path.normpath(subdir)))
photo_dict[file] = artist
artist_dict[artist].append(file)
## CC - Gilad Green - IML ##
def plot_principal_component(pca, i):
    """Return a 3D line trace spanning the i-th principal component.

    The segment length and line width are proportional to the square
    root of the component's singular value, so more important PCs are
    drawn longer and thicker. Colour comes from the module-level
    ``color_scheme``.
    """
    scale = np.sqrt(pca.singular_values_[i])
    # Two endpoints per coordinate: the component direction mirrored
    # through the origin, stretched by `scale`.
    endpoints = np.outer(pca.components_[i], np.array([-1, 1])) * scale
    return go.Scatter3d(
        x=endpoints[0], y=endpoints[1], z=endpoints[2],
        mode="lines", opacity=.5,
        line=dict(color=color_scheme[i], width=2 * scale),
        name='PC {}'.format(i + 1),
    )
"""This function loads data that created in the function
ImageLoop given in the bottom of the notebook
X_train.npy - is table of 3988 rows, row for each smaple\image\picture given
in kaggle folder "train" and 30000 features - 100*100*3
y_train - is table of 3988 rows - represent the artist of each picture (corrspond to X_train)
y_country - is table of 3988 rows - represent the artist origin by country (correspod to X_train)
all other tables are created in the same manner
val created from the val folder given by kaggle while tets is randomly spliting from test
"""
from sklearn.model_selection import train_test_split
X = np.load("X_train.npy")
y = np.load("y_train.npy")
y_country = np.load("y_country.npy")
X_val = np.load("X_val.npy")
y_val = np.load("y_val.npy")
y_country_val = np.load("X_train.npy")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
## CC - Gilad Green - IML ##
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
# Fixed three-colour palette shared by the PCA / clustering plots below.
color_scheme = ["rgb(189,6,96)", "rgb(6,189,99)", "rgb(6,96,189)"]
# Fit a 3-component PCA on the whole training matrix and project it,
# giving the 3D representation used for plotting.
pca_model = PCA(n_components=3).fit(X)
X_projected = pca_model.transform(X)
def ORG_PCA(X, y):
    """Show a 3D scatter of ``X`` projected onto the first three PCs,
    coloured by the labels ``y``.

    BUG FIX: the original ignored its ``X`` argument and always plotted
    the module-level ``X_projected``; it now projects the matrix it is
    given with the fitted ``pca_model``. For the original call site
    (which passed the matrix PCA was fit on) the result is identical.
    """
    proj = pca_model.transform(X)
    fig = px.scatter_3d(x=proj[:, 0], y=proj[:, 1], z=proj[:, 2], color=y, size_max=1)
    fig.update_layout(scene_aspectmode="cube",
                      title=r"$\text{(1) Projection Onto PCA Subspace}$",
                      scene_xaxis_title="PC1",
                      scene_yaxis_title="PC2",
                      scene_zaxis_title="PC3")
    # Outline each marker so overlapping points stay distinguishable.
    fig.update_traces(marker=dict(size=3,
                                  line=dict(width=2, color='DarkSlateGrey')),
                      selector=dict(mode='markers'))
    fig.show()
def PCA_eval():
    """Visualise how much variance the 3-component PCA explains.

    Left panel: singular value per PC. Right panel: waterfall of the
    explained-variance percentages plus their cumulative total.
    Relies on the module-level ``pca_model`` and ``color_scheme``.
    """
    # Eigenvalues of the covariance matrix = squared singular values.
    ev = pca_model.singular_values_ ** 2
    # NOTE(review): this DataFrame is built but never displayed or returned.
    df = pd.DataFrame(np.array([ev, ev / sum(ev), pca_model.explained_variance_ratio_]),
                      columns=["PC 1", "PC 2", "PC3"],
                      index=["Eigenvalues", "Explained Variance", "sklearn's Explained Variance"])
    # Total explained variance, rounded to 2 decimals for display.
    sumi = np.around(np.sum(pca_model.explained_variance_ratio_),decimals=2)
    # Per-PC explained variance in percent, with the total appended
    # as the waterfall's final "total" bar.
    variance = list(100 * np.around(pca_model.explained_variance_ratio_,decimals= 2)) + [100*sumi]
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=[r"$\text{Eigenvalues}$", r"$\text{Cumulative Explained Variance}$"],
                        specs=[[{'type': 'Bar'}, {'type': 'Waterfall'}]])
    fig.add_traces([go.Bar(x=['PC1', 'PC2', 'PC3'], y=pca_model.singular_values_, marker_color=color_scheme),
                    go.Waterfall(x=["PC1", "PC2", "PC3", "Total"],
                                 y=variance,
                                 text=[f"{v}%" for v in variance],
                                 textposition="outside",
                                 totals={"marker": {"color": "black"}},
                                 measure=["relative", "relative", "relative", "total"])],
                   rows=[1, 1], cols=[1, 2])
    # Coloured rectangles behind the waterfall bars, one per PC, each
    # starting where the previous cumulative level ended.
    fig.add_shape(type="rect", xref="x", yref="y", x0=-0.4, x1=0.4, y0=0.0, y1=fig.data[1].y[0],
                  fillcolor=color_scheme[0], line=dict(color=color_scheme[0]), opacity=1, row=1, col=2)
    fig.add_shape(type="rect", xref="x", yref="y", x0=0.6, x1=1.4, y0=fig.data[1].y[0],
                  y1=fig.data[1].y[0] + fig.data[1].y[1],
                  fillcolor=color_scheme[1], line=dict(color=color_scheme[1]), opacity=1, row=1, col=2)
    fig.add_shape(type="rect", xref="x", yref="y", x0=1.6, x1=2.4, y0=fig.data[1].y[0] + fig.data[1].y[1],
                  y1=fig.data[1].y[0] + fig.data[1].y[1] + fig.data[1].y[2],
                  fillcolor=color_scheme[2], line=dict(color=color_scheme[2]), opacity=1, row=1, col=2)
    fig.update_layout(showlegend=False, title=r"$\text{(2) PCA Explained Variance}$", margin=dict(t=100))
    fig.show()
# 3D scatter of the training images in PC space, coloured by artist country.
ORG_PCA(X,y_country)
We should think about and evaluate our new sub-space - how good are those PCs? In the sense of: how well does the projection onto them describe the data (how much of the variance does it keep?).
As we learned and discussed in class, the proportion between an eigenvalue and the sum of all eigenvalues of the feature matrix gives us the proportion of the variance it explains. We can see the absolute values of the eigenvalues and the proportions in the second graph - (2) PCA Explained Variance.
In our data, the top 3 PCs explain 45% of the data variance in total. That's cool! Instead of 100 * 100 * 3 = 30,000 features that describe 100% of a picture, we need only 3 to describe 45% of the variance.
In the next graph we evalute PCA reduction for the next mission - cheking out how many feature to use for data classification, when plotting is not needed
# Show the eigenvalue / explained-variance figure for the 3-component PCA.
PCA_eval()
So next we need to decide - how many components to use? Well, we know that adding components makes the analysis harder, so we want to reduce the number of components, but on the other hand keep the explained variance as high as we can.
The following code checks component numbers from 1 to 200 in jumps of 5. As we can see, 100 components give us 80% explained variance and 200 components give us only 85% - meaning that adding 100 more components adds only 5% of variance explanation.
So - in the next steps we will analyze the data using only 100 components.
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.datasets import make_moons
# Candidate numbers of PCA components: 1, 6, 11, ..., 196.
cmp = list(range(1,200,5))
# Pre-computed cumulative explained-variance ratio for each entry of `cmp`
# (generated once by the commented loop below and hard-coded to save time).
var_explained = [0.32252795374875093, 0.5473577325010345, 0.6090863546620551, 0.6474343496783221, 0.673818456998201, 0.6933456294213315, 0.7092786280719136, 0.7224780051644024, 0.7337657229416796, 0.743654156105565, 0.7523683178156403, 0.7600835412925967, 0.7670892067807604, 0.7734635045939114, 0.7793024587087803, 0.7847118175294681, 0.7897868575616117, 0.7945211201723869, 0.7990880405200035, 0.8032899778113859, 0.8072921529337342, 0.811031614930826, 0.8145794223657139, 0.8179886879670624, 0.8212279646507146, 0.8242397497509685, 0.8272007624386497, 0.8300208615509651, 0.8327522757355716, 0.8353723555409102, 0.8378770394806213, 0.8403368064905449, 0.8427143753393262, 0.845006529344865, 0.8472000978232513, 0.8493613216003854, 0.8514427063930443, 0.8534826349548814, 0.8554147167894187, 0.8573499438408055]
# var_explained = []
# for i in cmp:
#     pca_model = PCA(n_components=i).fit(X_train)
#     var_explained.append(sum(pca_model.explained_variance_ratio_))
# Explained variance as a function of the number of components.
fig = go.Figure([
    go.Scatter(name='Variance Explained in PCA model', x=cmp, y=var_explained, mode='markers+lines', marker_color='rgb(152,171,150)')
]).update_layout(title=r"$\text{(3) Explained Variance as function of number of components in PCA Model}$",
                 xaxis_title=r"$n \text{ - Number of Components}$",
                 yaxis_title=r"$\text{Variance Explained}$").show()
So, what's next? Next we will use the top two components in order to cluster the data using KNN!
Since KNN is a non-parametric model, we will use the whole X and the whole y (and not the train, test and validation splits, as we will in the next mission).
We will do that with different neighbor counts - in order to get the best score and to get a sense of how the number of neighbors changes the clustering map.
## Clustring - KNN ##
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn import preprocessing
import seaborn as sns
import matplotlib.patches as mpatches
# Numbers of neighbours to try.
# BUG FIX: renamed from `neighbors`, which shadowed the
# `sklearn.neighbors` module imported above.
neighbor_counts = [2, 5, 7, 10, 15, 20]
le_dict = {'French': 0, 'US': 1, 'Dutch': 2}
y_tag = [le_dict[x] for x in y_country]
h = 2  # mesh step size for the decision-boundary grid

# Loop invariants hoisted out of the loop: colour map, the 2D PCA
# projection, and the evaluation grid (the unused `cmap_bold` was removed).
cmap_light = ListedColormap(['orange', 'cyan', 'cornflowerblue'])
proj = X_projected[:, :2]
x_min, x_max = proj[:, 0].min() - 1, proj[:, 0].max() + 1
y_min, y_max = proj[:, 1].min() - 1, proj[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

for n in neighbor_counts:
    # Distance-weighted KNN fit on the top-2 PC projection.
    model = KNeighborsClassifier(n_neighbors=n, weights='distance')
    model.fit(proj, y_tag)
    # Predict over the grid to paint the decision regions.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.figure(figsize=(6, 4))
    plt.contourf(xx, yy, Z, cmap=cmap_light)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.legend(handles=[mpatches.Patch(color='cornflowerblue', label='Dutch'),
                        mpatches.Patch(color='cyan', label='US'),
                        mpatches.Patch(color='orange', label='French')])
    # Typo fix in the title: "Neighbers" -> "Neighbors".
    plt.title("KNN using " + str(n) + " Neighbors")
    plt.show()
So - what can we learn from above clustering?
First, it's nice to see the Bias-Variance Trade-Off. We can see that increasing the number of neighbors makes the Dutch classification less significant (bias rises, variance decreases), while with a low number of neighbors it becomes more significant.
This shows the bias-variance tradeoff for KNN: using more neighbors increases bias and decreases variance, and fewer neighbors do the opposite.
We also see that french artist rule the centre of PC1 and PC2 whie US rule the edges - mostly the right-up cornet and left up corner.
We can see The dutch classification isn't clearly seperated of the French and US, we can point that we don't have many Dutch artists, actually we have only one, which might be not enough for this kind ot mission.
Morover I tried to figure out if we can classify artist by their dominant color, I tried to sum up the first 1/3 values of the weight in the component first and second vector - in order to get the "weight" over the red color, and to sum up the next 1/3 values to get the weight for the green and blue values (next 1/3 for green and next 1/3 for blue)
we can see that there is no most dominant color (at least for two major component we use), unfortunately, it doesn't helped much!
print("Sum of Red weight in PCA first componet: ",sum(pca_model.components_[0][:1000])),2
print("Sum of Green weight in PCA first componet: ",sum(pca_model.components_[0][1000:2000]))
print("Sum of Blue weight in PCA first componet: ",sum(pca_model.components_[0][2000:3000]))
Sum of Red weight in PCA first componet: 7.030287284464242 Sum of Green weight in PCA first componet: 7.060508860728426 Sum of Blue weight in PCA first componet: 6.996582924466964
In this part we will use features matrix we get from projection over top 400 componenets of PCA.
# Refit PCA with 400 components and project every data split into
# that 400-dimensional space for the classification experiments.
pca_model = PCA(n_components=400).fit(X)
X_train_projected, X_test_projected, X_val_projected = (
    pca_model.transform(split) for split in (X_train, X_test, X_val)
)
Build two classification models in order to predict the painter from the paint. The goal here is to make a good prediction. Please include explanations on the process of developing your models. Be as clear and descriptive as you can be.
So, I thought it could be nice just trying multiple models for classification and check out which is the best, By now I will just try to get some sense of how it's look like and next I will choose two of the models and using validation data set I will try to imrove them. So I tried several models that I searched for in - https://scikit-learn.org/stable/supervised_learning.html, my attempts can be viewed in the last section of this notebook
Model Descripation:
Random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.
I choose this model because trees calssifier tend to "break" down the data in a greedy way that maximize score over train set, I thought I could be usefull and fit image processing.
Moreover I thought that due to the fact random forest use kind of "bagging" in a way it make de-correlation over data while growing the tree (it's choose the k cordinate randomly) could help us with this specific data- seems to by very correlated.
Present the tuning process. Alongside your description, add a table with hyper-parameters and their corresponding accuracies on the training and CV datasets, ordered by the CV accuracy in a decreasing order. Show only the best 15 combination; that is, the table should consist 15 rows max.
In the table below we can see all the tuning parameters I mentioned, along with the train, validation and test scores. Note that we "cropped" the table and kept only the 15 lines with the best validation score.
We can see that there is several combination gives best prediction over validation data set. for those combination we can point on kind of trade - off between tree -depth and number of estimators and min samples split in the manner that we can get the best result also by increasing tree depth (from 9 to 17/19) and also incresing the number of estimators and min samples split. It makes sense because this parameters have opposite effect on the bias-variance trade off, so in total it keeps the result quite the same.
So In order to evalute the preformenct of the Random Forest Classifier I foucs first on tuning several of hyper- parameters, I created a table of all combination of all hyper parameter and for evry combination checked the train, validation and test score. The parameter I choose to to use in the tuning process are those I find that can be tuned in the sklearn documantaion can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
max depth - The maximum depth of the tree. We know this affect bias-variacne Trade-off in trees high depth -> low bias and high variance.
number of estimators - The number of trees in the forest.
min samples split - The minimum number of samples required to split an internal node.
min samples leaf - The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.
max samples - If bootstrap is True, the number of samples to draw from X to train each base estimator.
I choose this parameters because this are the main parameter of the model that can get diffrent values - and we can try tune them. For every one of the parameters the range I used considered also the default value by sklearn package.
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grids swept for the random forest
# (ranges bracket sklearn's defaults).
depth = list(range(1,20,2))     # max_depth: 1, 3, ..., 19
nesti = list(range(10,300,20))  # n_estimators
mss = list(range(2,100,20))     # min_samples_split
msl = list(range(1,100,20))     # min_samples_leaf
# NOTE(review): removed `train_errors` / `val_errors` / `test_errors`
# here -- they were defined but never used anywhere in the notebook.
"""Code generated Pickle: """
# df = pd.DataFrame(columns=['tree depth', 'number of estimators', 'min samples split',
# 'min samples leaf','train score', 'validation score', 'test score'])
# for x in itertools.product(depth, nesti,mss,msl):
# model =RandomForestClassifier(max_depth=x[0],n_estimators=x[1],min_samples_split=x[2], min_samples_leaf =x[3],random_state=42)
# model.fit(X_train_projected,y_train)
# df = df.append({'tree depth': x[0], 'number of estimators':x[1], 'min samples split': x[2],
# 'min samples leaf': x[3],'train score': model.score(X_train_projected, y_train),
# 'validation score':model.score(X_val_projected, y_val),'test score':model.score(X_test_projected, y_test)}, ignore_index=True)
# df = df.sort_values(by='validation score', ascending=False)
# df.to_pickle("Random_forest.pkl")
# Load the pre-computed sweep and show the 15 best rows by validation score.
df = pd.read_pickle("Random_forest.pkl")
df = df.sort_values(by='validation score', ascending=False)
df.head(15)
| tree depth | number of estimators | min samples split | min samples leaf | train score | validation score | test score | |
|---|---|---|---|---|---|---|---|
| 1651 | 9.0 | 130.0 | 2.0 | 21.0 | 0.738445 | 0.311111 | 0.319131 |
| 1656 | 9.0 | 130.0 | 22.0 | 21.0 | 0.738445 | 0.311111 | 0.319131 |
| 1661 | 9.0 | 130.0 | 42.0 | 21.0 | 0.738445 | 0.311111 | 0.319131 |
| 3705 | 19.0 | 270.0 | 22.0 | 1.0 | 0.991759 | 0.311111 | 0.322473 |
| 3360 | 17.0 | 290.0 | 42.0 | 1.0 | 0.938732 | 0.311111 | 0.340852 |
| 3730 | 19.0 | 290.0 | 22.0 | 1.0 | 0.991401 | 0.310101 | 0.325815 |
| 2835 | 15.0 | 170.0 | 42.0 | 1.0 | 0.913293 | 0.310101 | 0.321637 |
| 3310 | 17.0 | 250.0 | 42.0 | 1.0 | 0.936582 | 0.309091 | 0.340852 |
| 3596 | 19.0 | 170.0 | 82.0 | 21.0 | 0.712648 | 0.309091 | 0.311612 |
| 2920 | 15.0 | 230.0 | 82.0 | 1.0 | 0.772841 | 0.309091 | 0.309942 |
| 3670 | 19.0 | 230.0 | 82.0 | 1.0 | 0.787531 | 0.308081 | 0.309942 |
| 1801 | 9.0 | 250.0 | 2.0 | 21.0 | 0.756001 | 0.308081 | 0.314119 |
| 1715 | 9.0 | 170.0 | 62.0 | 1.0 | 0.676460 | 0.308081 | 0.312448 |
| 3655 | 19.0 | 230.0 | 22.0 | 1.0 | 0.991043 | 0.308081 | 0.322473 |
| 3271 | 17.0 | 210.0 | 82.0 | 21.0 | 0.714081 | 0.308081 | 0.308271 |
Evaluate your performance using the tools from the class.
We will evaluate our classification using the ROC curve. Note that it is a bit more complex when using multiple classes.
https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5
We can learn from the result that over Test That although we get poor results for both classifiation and test, using TPR and FPR methods
But, pay attention to big diffrent in the ROC curve compare to next evalutaion I show, ROC uses the Probability to get specific label - it's not "binary", it count also how much out algorithm is "close" to true value. For Example - it might be that we predict image wrong, because the wrong artist got higher probability, but it was only a bit more then the correct artists probability, so we want to take this "close" to correct answer in acount!
So - In manner of ROC curve, we aren't that bad! (Although from the binary point of view we are...) Because there is many lines (ROC curve for each artist - we will use AUC parameter - because this parameter measure of the ability of a classifier to distinguish between classes (here - diffrent artists)
from sklearn.metrics import roc_curve, auc
from sklearn import datasets, metrics
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from itertools import cycle

artirst = ['Cezanne', 'Degas', 'Gauguin', 'Hassam', 'Matisse', 'Monet',
           'Pissarro', 'Renoir', 'Sargent', 'VanGogh']
# Best hyper-parameters from the tuning table above.
model = RandomForestClassifier(max_depth=9, n_estimators=130, min_samples_split=2,
                               min_samples_leaf=21, random_state=42)
model.fit(X_train_projected, y_train)
y_pred = model.predict(X_test_projected)
y_score = model.predict_proba(X_test_projected)

# One-vs-rest binarisation: column i is the indicator for artirst[i].
y_test_bin = label_binarize(y_test, classes=artirst)
n_classes = y_test_bin.shape[1]
fpr, tpr, roc_auc = dict(), dict(), dict()
# BUG FIX: the palette previously cycled only 3 colours over 10 classes,
# making curves indistinguishable, and `roc_auc` was never populated.
# tab10 gives each artist a distinct colour.
colors = plt.cm.tab10(np.linspace(0, 1, n_classes))
for i, color in zip(range(n_classes), colors):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2)
    print('AUC for Class {}: {}'.format(artirst[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Random Forest')
plt.rcParams["figure.figsize"] = (15,10)
plt.show()
AUC for Class Cezanne: 0.7499858669229464 AUC for Class Degas: 0.7427034973692356 AUC for Class Gauguin: 0.7490280051449953 AUC for Class Hassam: 0.7533654079797832 AUC for Class Matisse: 0.8122981956315288 AUC for Class Monet: 0.8093897140088719 AUC for Class Pissarro: 0.7746955233706385 AUC for Class Renoir: 0.7228987497289874 AUC for Class Sargent: 0.7823332231007425 AUC for Class VanGogh: 0.7286865671641792
Explore your predictions. Which paintings were misclassified? Why?
Load X_test and y_test and test your model. -- I talked with David and used splitting train folder given by kaggle into test and train insted.
In order to answer that question I will use the best model given by the table above, using:
We can see that the artist that the model had the best classification over their artwork are Matisse, Monet and Renoir. The artist we had low sucess in classification over it are Degas,Hassam and Cezanne.
It's intresting to understand why those specific artist are missclassified, In order to do that we can search after the tree greedy steps in order to see what happend and in which final "box" the artists appear. This will be very hard,so another option that will get the same effect ist to check the Probability to get one of the low misclassificatied artits and to get sense why the algorithm "confused" about it. This what I will try to do next (I gave only one of the artists exapmle- because it's similar idea for everyone else)
# Per-artist accuracy of the random forest on the test split.
artists = ['Cezanne', 'Degas', 'Gauguin', 'Hassam', 'Matisse',
           'Monet', 'Pissarro', 'Renoir', 'Sargent', 'VanGogh']
total = dict.fromkeys(artists, 0)  # test paintings per artist
score = dict.fromkeys(artists, 0)  # correctly classified per artist
for predicted, actual in zip(y_pred, y_test):
    total[actual] += 1
    if predicted == actual:
        score[actual] += 1
for key in total:
    # Guard against an artist with no paintings in the test split
    # (the original divided unconditionally). Typo fix: "tets" -> "test".
    ratio = np.round(score[key] / total[key] * 100, 2) if total[key] else 0.0
    print("Ratio of Success over test for: " + str(key) + " " + str(ratio) + "%")
Ratio of Success over tets for: Cezanne 12.78% Ratio of Success over tets for: Degas 7.5% Ratio of Success over tets for: Gauguin 24.22% Ratio of Success over tets for: Hassam 17.54% Ratio of Success over tets for: Matisse 53.85% Ratio of Success over tets for: Monet 48.78% Ratio of Success over tets for: Pissarro 41.07% Ratio of Success over tets for: Renoir 44.55% Ratio of Success over tets for: Sargent 26.61% Ratio of Success over tets for: VanGogh 24.0%
# Average predicted class probabilities over all Degas training paintings,
# to see which artists the model confuses Degas with.
Degas_index = np.where(y_train == 'Degas')
X_train_degas = X_train_projected[Degas_index]
degas_pred = model.predict_proba(X_train_degas)
sum_per_artists = np.around(degas_pred.sum(axis=0) / len(degas_pred) * 100, 2)
# BUG FIX: predict_proba columns are ordered by model.classes_; the
# original iterated the `total` dict and only lined up by coincidence
# of alphabetical insertion order. Iterate classes_ explicitly.
for i, artist in enumerate(model.classes_):
    print("Probability to classify as " + str(artist) + " while it's Degas artwork " + str(sum_per_artists[i]) + "%")
Probability to classify as Cezanne while it's Degas artwork 8.97% Probability to classify as Degas while it's Degas artwork 14.32% Probability to classify as Gauguin while it's Degas artwork 10.2% Probability to classify as Hassam while it's Degas artwork 8.96% Probability to classify as Matisse while it's Degas artwork 8.69% Probability to classify as Monet while it's Degas artwork 8.83% Probability to classify as Pissarro while it's Degas artwork 9.87% Probability to classify as Renoir while it's Degas artwork 11.41% Probability to classify as Sargent while it's Degas artwork 9.65% Probability to classify as VanGogh while it's Degas artwork 9.11%
Model Descripation: Generlly, SVM is an algorithm creates a line or a hyperplane which separates the data into classes. The C parameter tells the SVM optimization how much you want to avoid misclassifying each training example. For large values of C, the optimization will choose a smaller-margin hyperplane if that hyperplane does a better job of getting all the training points classified correctly. Conversely, a very small value of C will cause the optimizer to look for a larger-margin separating hyperplane, even if that hyperplane misclassifies more points. For very tiny values of C, you should get misclassified examples, often even if your training data is linearly separable.
Present the tuning process. Alongside your description, add a table with hyper-parameters and their corresponding accuracies on the training and CV datasets, ordered by the CV accuracy in a decreasing order. Show only the best 15 combination; that is, the table should consist 15 rows max.
Just as before - we tried to improve classification by tuning several parameters, This are the non-boolean parameters of the model
C - Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.
kernel - Specifies the kernel type to be used in the algorithm.
degree - Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
## QDA AND LDA NOT POSSIBLE DUE TO Variables are collinear
# Commented sweep below generated "SVC.pkl" once; it is loaded from
# disk here to save the (long) refit time. `degree` only affects the
# 'poly' kernel, which is why rows differing only in degree tie.
# C = [1,2,3]
# kernel = ["linear", "poly", "rbf", "sigmoid"]
# degree = [1,2,3,4,5,6,7,8,9,10]
# df = pd.DataFrame(columns=['C','kernel','degree','train score', 'validation score', 'test score'])
# for x in itertools.product(C, kernel,degree):
# model = make_pipeline(StandardScaler(), SVC(gamma='auto', C=x[0],kernel=x[1],degree=x[2]))
# model.fit(X_train_projected,y_train)
# df = df.append({"C": x[0],"kernel": x[1],"degree": x[2], 'train score': model.score(X_train_projected, y_train),
# 'validation score':model.score(X_val_projected, y_val),'test score':model.score(X_test_projected, y_test)}, ignore_index=True)
# df = df.sort_values(by='validation score', ascending=False)
# df.to_pickle("SVC.pkl")
# Load the pre-computed sweep (already sorted by validation score).
df = pd.read_pickle("SVC.pkl")
df.head(15)
| C | kernel | degree | train score | validation score | test score | |
|---|---|---|---|---|---|---|
| 27 | 1 | rbf | 8 | 0.903260 | 0.315152 | 0.302423 |
| 20 | 1 | rbf | 1 | 0.903260 | 0.315152 | 0.302423 |
| 22 | 1 | rbf | 3 | 0.903260 | 0.315152 | 0.302423 |
| 23 | 1 | rbf | 4 | 0.903260 | 0.315152 | 0.302423 |
| 24 | 1 | rbf | 5 | 0.903260 | 0.315152 | 0.302423 |
| 25 | 1 | rbf | 6 | 0.903260 | 0.315152 | 0.302423 |
| 26 | 1 | rbf | 7 | 0.903260 | 0.315152 | 0.302423 |
| 28 | 1 | rbf | 9 | 0.903260 | 0.315152 | 0.302423 |
| 29 | 1 | rbf | 10 | 0.903260 | 0.315152 | 0.302423 |
| 21 | 1 | rbf | 2 | 0.903260 | 0.315152 | 0.302423 |
| 68 | 2 | rbf | 9 | 0.968828 | 0.312121 | 0.309942 |
| 67 | 2 | rbf | 8 | 0.968828 | 0.312121 | 0.309942 |
| 66 | 2 | rbf | 7 | 0.968828 | 0.312121 | 0.309942 |
| 65 | 2 | rbf | 6 | 0.968828 | 0.312121 | 0.309942 |
| 64 | 2 | rbf | 5 | 0.968828 | 0.312121 | 0.309942 |
Evaluate your performance using the tools from the class.
Again we wil evaluter preformenct with ROC curve, we can see the values are quite similar to those of random forest.
from sklearn.metrics import roc_curve, auc
from sklearn import datasets, metrics
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from itertools import cycle

artirst = ['Cezanne', 'Degas', 'Gauguin', 'Hassam', 'Matisse', 'Monet',
           'Pissarro', 'Renoir', 'Sargent', 'VanGogh']
# Scaled SVC with probability estimates enabled so predict_proba is
# available for the ROC computation below.
model = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
model.fit(X_train_projected, y_train)
y_pred = model.predict(X_test_projected)
y_score = model.predict_proba(X_test_projected)

# One-vs-rest binarisation: column i is the indicator for artirst[i].
y_test_bin = label_binarize(y_test, classes=artirst)
n_classes = y_test_bin.shape[1]
fpr, tpr, roc_auc = dict(), dict(), dict()
# BUG FIX: the 3-colour cycle repeated colours across the 10 classes and
# `roc_auc` was never populated; tab10 gives each artist its own colour.
colors = plt.cm.tab10(np.linspace(0, 1, n_classes))
for i, color in zip(range(n_classes), colors):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2)
    print('AUC for Class {}: {}'.format(artirst[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - SVM')
plt.rcParams["figure.figsize"] = (15,10)
plt.show()
AUC for Class Cezanne: 0.6845073209339139 AUC for Class Degas: 0.6913571649644072 AUC for Class Gauguin: 0.7137438610851262 AUC for Class Hassam: 0.7588245776028252 AUC for Class Matisse: 0.8037828426717317 AUC for Class Monet: 0.8077546138589877 AUC for Class Pissarro: 0.7473090849242923 AUC for Class Renoir: 0.7747976439979766 AUC for Class Sargent: 0.7709241499564079 AUC for Class VanGogh: 0.6829402985074626
Explore your predictions. Which paintings were misclassified? Why?
same as before - now only for prediction made by the SVM model.
Top artists score classification:
Top artists score classification:
We alredy saw, that the reson of that misclassification is similarity between diffrent artist works.
# Per-artist accuracy of the SVM on the test split (same analysis as
# for the random forest above).
artists = ['Cezanne', 'Degas', 'Gauguin', 'Hassam', 'Matisse',
           'Monet', 'Pissarro', 'Renoir', 'Sargent', 'VanGogh']
total = dict.fromkeys(artists, 0)  # test paintings per artist
score = dict.fromkeys(artists, 0)  # correctly classified per artist
for predicted, actual in zip(y_pred, y_test):
    total[actual] += 1
    if predicted == actual:
        score[actual] += 1
for key in total:
    # Guard against an artist with no paintings in the test split
    # (the original divided unconditionally). Typo fix: "tets" -> "test".
    ratio = np.round(score[key] / total[key] * 100, 2) if total[key] else 0.0
    print("Ratio of Success over test for: " + str(key) + " " + str(ratio) + "%")
Ratio of Success over tets for: Cezanne 12.03% Ratio of Success over tets for: Degas 21.67% Ratio of Success over tets for: Gauguin 19.53% Ratio of Success over tets for: Hassam 18.42% Ratio of Success over tets for: Matisse 64.96% Ratio of Success over tets for: Monet 48.78% Ratio of Success over tets for: Pissarro 41.07% Ratio of Success over tets for: Renoir 36.63% Ratio of Success over tets for: Sargent 29.03% Ratio of Success over tets for: VanGogh 20.8%
Discuss the differences between your models, in their assumptions, and explain why did you choose them. Consider manipulating your data to see if it helps you achieve better results.
I used also this source - https://datascience.stackexchange.com/questions/6838/when-to-use-random-forest-over-svm-and-vice-versa
We alredy talked about each one of the model sepretly, described it and showed model paramerter tuning effect over data. For example, we talked about the bias-varince tradeoff and the paramertes affect this in each of the models (mainly tree depth in random forest and C regularizaion parameter in SVM) We also saw that the models are the same in the way they "view" the problem - due to two of them belong to the field of Supervised learning. I choosed this models because first, while trying many other models from SKLEARN - they did the best, Second, this models are "rich" of paramerter can be scaled - so in this kind of problem, I though it would be good. Moreover we might be not that suprised this models worked because, in some manner - they quite similar, and both of them "split" the subspace created by feature into "boxes" (Tree) or sub-subspaces(SVM). I tried couple of ideas in order to manupulate data -such as not scaling data, using grey-scale histogram or red, blue green histogram splitted, but, none of this ideas really works (example of histogram I used given in the bottom of the notebook) I guess that people who are expert in Image processing could have beteer ideas based on more complex theory
SVM VS RANDOM FOREST -
| Criteria | SVM | Random Forest |
|---|---|---|
| Fits multiclass problems? | No, we get probability from distance calculation | Yes |
| Data scaling? | Need | No need |
| Complexity? | High, due to n x n matrix | Low |
| When to use? | problem might not be linearly separable | handle large number of training example,handle non-linear data |
| What's common | Supervised | Supervised |
How does model performance? Discuss.
For both models we see low results on the validation and test sets. But if we investigate the results a bit more we can see, using the ROC curve, that in terms of probability the classification isn't that bad; although it's wrong in the "bottom line", it's quite close to the correct results.
Unfortunately, we reached only a ~31% success rate on the validation and test sets for both the random forest and the SVM.
Because I tried many other models, eventhough it's not greate, it's better then other.
I think the main reson the models aren't very good is that is that some of artwork are quite the same - or very similar. Also, I think that maybe using thet data "as is" with no manipulation (beside resize, that this step only re-scale it but don't change it) maybe cause this low rate of success. Moreover I believe that maybe high level of applications in Deep Learning, that build specifly in order to image processing and classification can help us imporve Results.
"""
Other model trying
"""
from sklearn.linear_model import RidgeClassifier
# Regularization strengths swept for the ridge-classifier baseline.
alphas=[1e-4,1e-3, 1e-2, 1e-1, 1]
# The grid search below was run once and its score table pickled; it stays
# commented out so the notebook loads the cached result instead of refitting
# (fitting on the full projected pixel data is slow).
# NOTE(review): DataFrame.append is deprecated in modern pandas — if re-run,
# collect rows in a list and build the frame with pd.concat/pd.DataFrame.
# df = pd.DataFrame(columns=['alpha','train score', 'validation score', 'test score'])
# for a in alphas:
# model =RidgeClassifier(alpha=a)
# model.fit(X_train_projected,y_train)
# df = df.append({'alpha': a,'train score': model.score(X_train_projected, y_train),
# 'validation score':model.score(X_val_projected, y_val),'test score':model.score(X_test_projected, y_test)}, ignore_index=True)
# df = df.sort_values(by='validation score', ascending=False)
# df.to_pickle("RidgeClassifier.pkl")
# Load the cached score table (best validation score first) and display it.
df = pd.read_pickle("RidgeClassifier.pkl")
df
| alpha | train score | validation score | test score | |
|---|---|---|---|---|
| 0 | 0.0001 | 0.555357 | 0.258586 | 0.265664 |
| 1 | 0.0010 | 0.555357 | 0.258586 | 0.265664 |
| 2 | 0.0100 | 0.555357 | 0.258586 | 0.265664 |
| 3 | 0.1000 | 0.555357 | 0.258586 | 0.265664 |
| 4 | 1.0000 | 0.555715 | 0.258586 | 0.264829 |
"""
The function used to resize the data to a fixed form of 100*100*3 (RGB)
"""
def resize():
    """Resize every image under ``rootdir`` to 100x100 RGB, in place.

    Walks the whole directory tree (so the train and validation subfolders
    are all covered) and overwrites each file with its resized RGB JPEG,
    so all later analysis works on images of identical dimensions.
    """
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            filepath = os.path.join(subdir, file)
            if os.path.isfile(filepath):
                im = Image.open(filepath)
                # Image.ANTIALIAS was deprecated and removed in Pillow 10;
                # Image.LANCZOS is the same high-quality resampling filter.
                resized = im.resize((100, 100), Image.LANCZOS)
                # Drop any alpha/palette channel so every sample has
                # exactly 3 color channels.
                rgb_im = resized.convert('RGB')
                rgb_im.save(filepath, "JPEG")
"""
This function is used to create the feature matrix and the label matrix for the train and validation data
given by Kaggle.
Note: the test set was created by splitting the train data (from the Kaggle train folder).
"""
def imageLoop():
    """Populate the global feature/label lists from the images under ``rootdir``.

    For every image file: appends the flattened raw pixel vector to
    ``rawImages``, a [0, 1]-scaled copy to ``features``, the artist name to
    ``artists_labels``, and the artist's country of origin to ``labels``.
    Assumes each image sits in a folder named after its artist.
    """
    labels_dict_country = {'Cezanne': 'French', 'Degas': 'French', 'Gauguin': 'French',
                           'Hassam': 'US', 'Matisse': 'French', 'Monet': 'French',
                           'Pissarro': 'French', 'Renoir': 'French', 'Sargent': 'US',
                           'VanGogh': 'Dutch'}
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            filepath = os.path.join(subdir, file)
            if os.path.isfile(filepath):
                # The immediate folder name identifies the artist.
                artist = os.path.basename(os.path.normpath(subdir))
                label = labels_dict_country[artist]
                image = cv2.imread(filepath)
                pixel = image.flatten()
                rawImages.append(pixel)
                # Pixel values span 0-255, so divide by 255 (not 256) to map
                # them exactly onto [0, 1].
                features.append(pixel / 255.0)
                artists_labels.append(artist)
                labels.append(label)
"""
Nice graph for random forest
"""
from sklearn.ensemble import RandomForestClassifier

# Sweep the maximum tree depth and record accuracy on each split; the
# resulting curves visualize the bias-variance tradeoff for the forest.
depths = list(range(1, 20, 2))
train_scores = []
val_scores = []
test_scores = []
for d in depths:
    model = RandomForestClassifier(max_depth=d, random_state=42)
    model.fit(X_train_projected, y_train)
    # score() returns mean accuracy — these are scores, not errors.
    train_scores.append(model.score(X_train_projected, y_train))
    val_scores.append(model.score(X_val_projected, y_val))
    test_scores.append(model.score(X_test_projected, y_test))
fig = go.Figure([
    go.Scatter(name='Train Score', x=depths, y=train_scores, mode='markers+lines', marker_color='rgb(152,171,150)'),
    go.Scatter(name='Validation Score', x=depths, y=val_scores, mode='markers+lines', marker_color='rgb(220,179,144)'),
    go.Scatter(name='Test Score', x=depths, y=test_scores, mode='markers+lines', marker_color='rgb(25,115,132)')]).update_layout(title=r"$\text{Random Forest Score over Train, Validation and Test Sets}$",
                                                                                                                                xaxis_title=r"$D\text{ - Tree Depth}$",
                                                                                                                                yaxis_title=r"$\text{Score Value}$").show()
def extract_color_histogram(img, bins=(8, 8, 8)):
    """Return a flattened per-channel color histogram of ``img``.

    Computes a 256-bin histogram for each of the three BGR channels
    (OpenCV channel order) and interleaves them into a single 1-D
    feature vector of length 768 (256 rows x 3 channels, transposed
    then flattened).

    NOTE(review): ``bins`` is kept for interface compatibility but is
    not used — the implementation always uses 256 bins per channel.
    An earlier HSV-based variant was removed as dead code.
    """
    channel_hists = []
    for channel in range(3):  # 0=blue, 1=green, 2=red
        hist = cv2.calcHist([img], [channel], None, [256], [0, 256])
        channel_hists.append(hist.flatten())
    # Transpose so values are interleaved per intensity level before
    # flattening into the final feature vector.
    return np.array(channel_hists).T.flatten()